PilotD2Redo - QC and exploratory analysis¶
In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import os
import yaml
import scanpy as sc
import rapids_singlecell as rsc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.autolayout'] = True
# Increase all font sizes
plt.rcParams['font.size'] = 12 # Base font size
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['xtick.labelsize'] = 11
plt.rcParams['ytick.labelsize'] = 11
plt.rcParams['legend.fontsize'] = 11
from preprocess import _convert_oak_path
import qc_plots
/home/emmadann/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Configuration
In [114]:
# Read config
experiment_name = 'PilotD2Redo'
config_file = '../../metadata/experiments_config.yaml'
with open(config_file, 'r') as f:
config = yaml.safe_load(f)
config = config[experiment_name]
datadir = _convert_oak_path(config['datadir'])
sample_metadata_csv = _convert_oak_path(config['sample_metadata'])
PLOTDIR = f'../../results/{experiment_name}/'
sc.settings.figdir = PLOTDIR
def save_plot(pl_name, plot_dir = None):
if plot_dir is None:
plot_dir = PLOTDIR
plt.savefig(f'{plot_dir}/{pl_name}.pdf');
plt.savefig(f'{plot_dir}/{pl_name}.png');
Read merged dataset¶
In [129]:
sample_metadata = pd.read_csv(sample_metadata_csv, index_col=0)
sgrna_library_metadata = pd.read_csv('../../metadata/sgRNA_library_curated.csv', index_col=0)
sample_metadata
Out[129]:
| experiment_id | sample_id | donor_id | culture_condition | library_id | library_prep_kit | |
|---|---|---|---|---|---|---|
| 0 | PilotD2Redo | PilotD2Redo_Day7 | CE0010216 | Day7 | PilotD2Redo_Day7 | GEMX_flex_v2 |
| 1 | PilotD2Redo | PilotD2Redo_Rest | CE0010216 | Rest | PilotD2Redo_Rest | GEMX_flex_v2 |
| 2 | PilotD2Redo | PilotD2Redo_Restim6hr | CE0010216 | Restim6hr | PilotD2Redo_Restim6hr | GEMX_flex_v2 |
| 3 | PilotD2Redo | PilotD2Redo_Restim24hr | CE0010216 | Restim24hr | PilotD2Redo_Restim24hr | GEMX_flex_v2 |
In [130]:
adata = sc.read_h5ad(f'{datadir}/{experiment_name}_merged.gex.lognorm.h5ad')
adata.var = adata.var[['gene_ids', 'gene_name', 'mt']].copy()
adata.var_names = adata.var[ 'gene_name'].values
In [132]:
# Load assignment of sgRNAs to cells
sgrna_assignment = pd.read_csv(f'{datadir}/sgrna_assignment.csv', index_col=0)
sgrna_assignment_index = sgrna_assignment.index.tolist()
sgrna_assignment = pd.merge(sgrna_assignment, sgrna_library_metadata.rename({"sgrna_id":'guide_id'}, axis=1), how='left')
sgrna_assignment.index = sgrna_assignment_index
existing_cols = [col for col in sgrna_assignment.columns if col in adata.obs.columns]
adata.obs.drop(columns=existing_cols, inplace=True)
adata.obs = pd.concat([adata.obs, sgrna_assignment.loc[adata.obs_names]], axis=1)
adata.obs['guide_type'] = np.where(adata.obs['guide_id'].str.startswith('NTC-'), 'non-targeting', 'targeting')
QC metrics¶
In [7]:
qc_plots.plot_ncells_sample(adata);
save_plot(f'{experiment_name}_n_cells_preQC')
In [8]:
fig, axs = plt.subplots(3,1, figsize=(5,10))
for i,m in enumerate(['total_counts', 'n_genes', 'pct_counts_mt']):
sc.pl.violin(adata, m, groupby='sample_id', rotation=90, show=False, ax=axs[i]);
if i != 2:
# remove x-axis tick labels
axs[i].set_xticklabels([])
fig.tight_layout()
fig.show()
In [9]:
sc.pl.scatter(adata, 'log1p_total_counts', 'log1p_n_genes_by_counts', color='culture_condition')
In [10]:
adata.obs[['total_counts', 'n_genes', 'pct_counts_mt', 'sample_id']].groupby('sample_id').mean()
Out[10]:
| total_counts | n_genes | pct_counts_mt | |
|---|---|---|---|
| sample_id | |||
| PilotD2Redo_Day7 | 25513.193359 | 5880.084336 | 0.820162 |
| PilotD2Redo_Rest | 14707.506836 | 4779.557502 | 0.623509 |
| PilotD2Redo_Restim6hr | 13811.750000 | 4351.640373 | 0.484242 |
| PilotD2Redo_Restim24hr | 15504.541992 | 4885.655684 | 0.835122 |
Estimate fraction of low quality cells (high fraction of mitochondrial genes, low number of captured genes)
In [11]:
adata.obs['low_quality'] = (adata.obs['pct_counts_mt'] > 5) | (adata.obs['n_genes'] < 1000)
In [12]:
# plot fraction of low_quality cells for each sample_id
low_quality_df = adata.obs.groupby(['sample_id', 'culture_condition'])['low_quality'].mean().reset_index(name='fraction_low_quality')
low_quality_df
sns.barplot(data=low_quality_df, x='sample_id', y='fraction_low_quality', hue='culture_condition', dodge=False);
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
plt.xticks(rotation=90);
sgRNA assignment QC metrics¶
In [119]:
import upsetplot
from upsetplot import from_contents
In [118]:
all_crispr_a = {}
for sample_id in sample_metadata['sample_id']:
sgrna_h5ad = f"{datadir}{sample_id}.sgRNA.h5ad"
crispr_a = sc.read_h5ad(sgrna_h5ad)
all_crispr_a[sample_id] = crispr_a
In [121]:
# Get lists of inefficient and nonspecific guides for each sample
all_inefficient = {k:x.var_names[x.var['inefficient']].tolist() for k,x in all_crispr_a.items()}
all_nonspecific = {k:x.var_names[x.var['nonspecific']].tolist() for k,x in all_crispr_a.items()}
inef = from_contents(all_inefficient)
ax_dict = upsetplot.UpSet(inef, subset_size="count").plot()
nons = from_contents(all_nonspecific)
ax_dict = upsetplot.UpSet(nons).plot()
In [196]:
pl_df = adata.obs[['guide_id', 'sample_id']].copy()
pl_df['group'] = 'targeting single sgRNA'
pl_df['group'] = np.where(adata.obs['guide_id'].str.startswith('NTC'), 'NTC single sgRNA', pl_df['group'])
pl_df['group'] = np.where(adata.obs['guide_id'].isna(), 'no sgRNA (>= 3 UMIs)', pl_df['group'])
pl_df['group'] = np.where(adata.obs['guide_id'] == 'multi_sgRNA', 'multi sgRNA (>= 3 UMIs)', pl_df['group'])
# Plot barplot of number of cells in each group for each sample
group_counts = pl_df.groupby(['sample_id', 'group']).size().unstack()
group_counts.plot(kind='bar', stacked=False, figsize=(8,2))
plt.xticks(rotation=45, ha='right')
plt.ylabel('Number of cells')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
In [215]:
no_guide_cells = adata.obs_names[adata.obs['guide_id'].isna()]
multi_guide_cells = adata.obs_names[adata.obs['guide_id'] == 'multi_sgRNA']
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for i, (sample_id, crispr_a) in enumerate(all_crispr_a.items()):
crispr_a.obs['multi_sgrna'] = crispr_a.obs_names.isin(multi_guide_cells)
crispr_a.obs['no_sgrna'] = crispr_a.obs_names.isin(no_guide_cells)
crispr_a.obs['targeting_umis'] = np.array(crispr_a[:, crispr_a.var['sgrna_type'] == 'targeting'].X.sum(axis=1)).flatten()
crispr_a.obs['ntc_umis'] = np.array(crispr_a[:, crispr_a.var['sgrna_type'] == 'NTC'].X.sum(axis=1)).flatten()
crispr_a.obs['inefficient_umis'] = np.array(crispr_a[:, crispr_a.var['inefficient']].X.sum(axis=1)).flatten()
crispr_a.obs['nonspecific_umis'] = np.array(crispr_a[:, crispr_a.var['nonspecific']].X.sum(axis=1)).flatten()
ntc_n = sum(no_sgrna_a.obs['ntc_umis'] > 3)
tot_n = no_sgrna_a.n_obs
subtitle = f'{ntc_n}/{tot_n} multi sgRNA cells with NTC sgRNAs'
no_sgrna_a = crispr_a[crispr_a.obs['multi_sgrna']].copy()
axes[i].scatter(no_sgrna_a.obs['ntc_umis'], no_sgrna_a.obs['targeting_umis'], s=3)
axes[i].set_xscale('log')
axes[i].set_yscale('log')
axes[i].set_xlabel('NTC sgRNA UMI counts')
axes[i].set_ylabel('Targeting sgRNA UMI counts')
axes[i].set_title('Multi sgRNA cells - ' + crispr_a.obs['sample_id'][0] + '\n' + subtitle)
plt.tight_layout()
plt.show()
Dimensionality reduction¶
In [13]:
sc.pl.pca(adata, annotate_var_explained=True, color=['culture_condition'], components=['1,2'], wspace=0.5, ncols=3, size=2)
In [14]:
sc.pl.pca(adata, annotate_var_explained=True, color=['sample_id', 'pct_counts_mt', 'log1p_n_genes_by_counts'], components=['1,2', '3,4', '5,6'], wspace=0.5, ncols=3, size=3)
Visualize genes with top loadings for each PC
In [15]:
sc.pl.pca_loadings(adata, components='1,2,3');
sc.pl.pca_loadings(adata, components='4,5,6')
In [226]:
sc.pl.pca(adata, annotate_var_explained=True, color=['IL2', 'MT-ND4'], components=['2,3'], wspace=0.1, ncols=3, size=2, vmax=4)
Clustering¶
In [16]:
rsc.pp.neighbors(adata, n_neighbors=50)
rsc.tl.umap(adata)
rsc.tl.louvain(adata)
In [7]:
# adata.write_h5ad()
In [21]:
sc.pl.umap(adata, color=['sample_id'] + ['log1p_total_counts','log1p_n_genes_by_counts','pct_counts_mt'], wspace=0.4, size=3)
In [22]:
qc_plots.plot_markers_sample_dotplot(adata, save=f'{experiment_name}_markers_sample_dotplot')
WARNING: saving figure to file ../../results/PilotD2Redo/dotplot_PilotD2Redo_markers_sample_dotplot.pdf
In [22]:
qc_plots.plot_markers_umap(adata, wspace=0.1, size=3, cmap='magma', ncols=5, save=f'{experiment_name}_markers.png')
WARNING: saving figure to file ../../results/PilotD2Redo/umapPilotD2Redo_markers.png